home *** CD-ROM | disk | FTP | other *** search
Text File | 1990-09-13 | 44.9 KB | 1,284 lines |
- /* National Institute of Standards and Technology (NIST)
- /* National Computer System Laboratory (NCSL)
- /* Office Systems Engineering (OSE) Group
- /* ********************************************************************
- /* D I S C L A I M E R
- /* (March 8, 1989)
- /*
- /* There is no warranty for the NIST NCSL OSE SGML parser and/or the NIST
- /* NCSL OSE SGML parser validation suite. If the SGML parser and/or
- /* validation suite is modified by someone else and passed on, NIST wants
- /* the parser's recipients to know that what they have is not what NIST
- /* distributed, so that any problems introduced by others will not
- /* reflect on our reputation.
- /*
- /* Policies
- /*
- /* 1. Anyone may copy and distribute verbatim copies of the SGML source
- /* code as received in any medium.
- /*
- /* 2. Anyone may modify your copy or copies of SGML parser source code or
- /* any portion of it, and copy and distribute such modifications provided
- /* that all modifications are clearly associated with the entity that
- /* performs the modifications.
- /*
- /* NO WARRANTY
- /* ===========
- /*
- /* NIST PROVIDES ABSOLUTELY NO WARRANTY. THE SGML PARSER AND VALIDATION
- /* SUITE ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
- /* EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
- /* THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS
- /* WITH YOU. SHOULD THE SGML PARSER OR VALIDATION SUITE PROVE DEFECTIVE,
- /* YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
- /*
- /* IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL NIST BE LIABLE FOR
- /* DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL,
- /* INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR
- /* INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA
- /* BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A
- /* FAILURE OF THE PROGRAM TO OPERATE WITH PROGRAMS NOT DISTRIBUTED BY
- /* NIST) THE PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF
- /* SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
- */
-
- /************************************************************************/
- /* TITLE: SGML PARSER */
- /* SYSTEM: DOCUMENT PROCESSOR */
- /* SUBSYSTEM: */
- /* SOURCE FILE: DIGETS.C */
- /* AUTHOR: Steven Lindeman, Fred Maples */
- /* */
- /* DATE CREATED: */
- /* LAST MODIFIED: */
- /* */
- /* REVISIONS */
- /* WHEN WHO WHY */
- /************************************************************************/
- #include <stdio.h>
- #include <search.h>
- #include <ctype.h>
- #include "didefs.h"
- #include "diglobal.h"
-
- /*------------------------------------------------------*/
- /* G E T D E L I M */
- /* */
- /* Called by: GETTOKEN */
- /* */
- /* Returns: ETAGO, STAGO, PIO, MDO, */
- /* DELIM, or EOF */
- /* */
- /* reads a delimeter from input */
- /*------------------------------------------------------*/
- getdelim()
- {
- register int inchar,retval;
- switch(inchar=our_fgetc(indoc)) {
- case '<':
- switch(inchar=our_fgetc(indoc)) {
- case '/': /* found immediately after TAGO, */
- if (isalpha(inchar=our_fgetc(indoc))) {
- our_ungetc(inchar,indoc);
- putstr_outbuf("\n[/"); /* so it's an endtag */
- retval = ETAGO;
- }
- else {
- our_ungetc(inchar,indoc);
- unget_string("</");
- retval = NODELIM;
- }
- break;
- case '?': /* found a processing instruction */
- putstr_outbuf("\n[?");
- retval = PIO;
- break;
- case '!': /* found a markup declaration open */
- if ((inchar=our_fgetc(indoc)) == '-')
- if ((inchar=our_fgetc(indoc)) == '-') {
- unget_string("--");
- putstr_outbuf("<!");
- retval = MDO;
- }
- else {
- our_ungetc('-',indoc);
- our_ungetc(inchar,indoc);
- retval = NODELIM;
- }
- else
- if (isalpha(inchar) || inchar=='[' || inchar==MARKUP_END) {
- our_ungetc(inchar,indoc);
- putstr_outbuf("<!");
- retval = MDO;
- }
- else {
- our_ungetc(inchar,indoc);
- retval = NODELIM;
- }
- break;
- default:
- if (isalpha(inchar)) {
- our_ungetc(inchar,indoc);
- putstr_outbuf("\n[");
- retval = STAGO;
- }
- else {
- our_ungetc(inchar,indoc);
- our_ungetc('<',indoc);
- retval = NODELIM;
- }
- break;
- }
- break;
- case EOF:
- retval = EOF; /* no more data */
- break;
- default:
- our_ungetc(inchar,indoc);
- retval = NODELIM; /* no delimeter was found, probably data */
- break;
- }
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T C D A T A */
- /* This routine reads character data from 'indoc'. */
- /* CDATA is terminated by an etago, delimiter */
- /* in context. That is a '</' followed by a name */
- /* start character. */
- /* */
- /* returns -- NFDHT, FOUND */
- /*------------------------------------------------------*/
- STATUS getcdata()
- {
- int inchar;
- unsigned num_cr;
- char *outstr;
- STATUS retval;
- register BOOLEAN more_cdata,cr_found;
- BOOLEAN firsttime;
-
- retval = NFDHT;
- outstr = get_char_mem(2);
- flush_buf();
- more_cdata = firsttime = TRUE;
- num_cr = 0;
-
- while(more_cdata==TRUE && (inchar=our_fgetc(indoc))!=EOF) {
- cr_found = save_crs(&num_cr,&inchar);
- if (inchar == '<')
- if ((inchar=our_fgetc(indoc)) == '/')
- if (isalpha(inchar=our_fgetc(indoc))) {
- more_cdata = FALSE;
- our_ungetc(inchar,indoc); /* unget in reverse order */
- unget_string("</");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&firsttime,FALSE);
- (*print_ctr)(ctrfp,"</");
- (*applic)(DATA_STG,"</","");
- our_ungetc(inchar,indoc);
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&firsttime,FALSE);
- (*put_ctr)('<',ctrfp);
- (*applic)(DATA_STG,"<","");
- our_ungetc(inchar,indoc);
- }
- else
- if (inchar == OUR_EE) {
- if (entstack[--entitylevel] != lookstack())
- ourexit(2,"\nError: Entity End occurred in different character data.\n");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&firsttime,FALSE);
- *outstr = inchar;
- (*applic)(DATA_STG,outstr,"");
- (*put_ctr)(inchar,ctrfp);
- }
- }
- if (retval == FOUND)
- (*put_ctr)('|',ctrfp);
- free(outstr);
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T P C D A T A */
- /* This routine reads parsable character data */
- /* from 'indoc'. PCDATA is terminated by an */
- /* etago, delimiter in context, given that all */
- /* entities have been closed. That is, a '</' */
- /* followed by a name start character. General */
- /* entities, as well as numeric and named char- */
- /* acter references are resolved. */
- /* */
- /* returns -- NFDHT, FOUND */
- /*------------------------------------------------------*/
- STATUS getpcdata(genthead,penthead)
- ENTITYDESC *genthead,*penthead;
- {
- int inchar,token,token2;
- unsigned num_cr;
- char *outstr;
- BOOLEAN more_pcdata,more_subdata,cr_found,pcdata_ft;
- STENTRY *tp;
- TKNRETVAL tknretval;
- STATUS retval;
- TNODE *newcm;
-
- flush_buf();
- outstr = get_char_mem(2);
- num_cr = 0;
- retval = NFDHT;
- tknretval = TEXT; /* so will read all the data first */
- our_ungetc(inchar=our_fgetc(indoc),indoc); /* initialize inchar */
- token = -1;
- tp = NULL; /* just to satisfy lint */
- more_pcdata = pcdata_ft = TRUE;
-
- while(more_pcdata && inchar!=EOF) {
- if (open_rcdata_ms)
- retval = getrcdata(genthead,FALSE,&pcdata_ft,FALSE);
- else
- if (open_cdata_ms)
- retval = get_cdata_ms(&pcdata_ft);
- else
- if (tknretval == TEXT) {
- more_subdata = TRUE;
- while(more_subdata && (inchar=our_fgetc(indoc))!=EOF) {
- cr_found = save_crs(&num_cr,&inchar);
- if (inchar == EOF)
- more_subdata = pcdata_ft = FALSE;
- else
- if (inchar == ']')
- if ((inchar=our_fgetc(indoc)) == ']')
- if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
- if (--num_open_ms < 0)
- ourexit(2,"\nError: Marked section end outside of declaration.\n");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
- our_ungetc(inchar,indoc);
- (*applic)(DATA_STG,"]]","");
- (*print_ctr)(ctrfp,"]]");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
- our_ungetc(inchar,indoc);
- (*put_ctr)(']',ctrfp);
- (*applic)(DATA_STG,"]","");
- }
- else
- if (inchar == '<')
- if ((inchar=our_fgetc(indoc)) == '/')
- if (isalpha(inchar=our_fgetc(indoc))) {
- more_subdata = pcdata_ft = FALSE;
- our_ungetc(inchar,indoc);
- unget_string("</");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
- (*print_ctr)(ctrfp,"</");
- (*applic)(DATA_STG,"</","");
- our_ungetc(inchar,indoc);
- }
- else /* found markup */
- if (inchar=='?' || isalpha(inchar)) {
- more_subdata = pcdata_ft = FALSE;
- our_ungetc(inchar,indoc);
- our_ungetc('<',indoc);
- }
- else
- if (inchar == '!')
- retval = check_for_mdo(&more_subdata,&num_cr,cr_found,&pcdata_ft);
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
- (*put_ctr)('<',ctrfp);
- (*applic)(DATA_STG,"<","");
- our_ungetc(inchar,indoc);
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
- try_entref(inchar,genthead,FALSE,&pcdata_ft);
- }
- }
- check_cr(&num_cr,cr_found,&pcdata_ft,TRUE);
- }
- else
- if (find_except(currincl,token) && !find_except(currexcl,token)) {
- newcm = pushcreate(tp);
- if ((retval = traverse(newcm,tp,genthead,penthead,&pcdata_ft)) == NFSH) {
- if (tknretval == TEXT)
- sprintf(error_msg,"%s%s%s","\nError: Invalid data, last opened element '",tp->nametoken,"'.\n");
- else
- sprintf(error_msg,"%s%s%s","\nError: Invalid tag, last opened element '",tp->nametoken,"'.\n");
- FATAL_ERROR()
- }
-
- /* check to make sure the element has content */
- if (EMPTY_CONTENT(newcm)) { /* can't have endtag for EMPTY */
- token2 = token | HIGHBIT;
- putstr_outbuf("\n[/");
- putstr_outbuf(tp->nametoken);
- putstr_outbuf("]");
- place_in_queue(END_TAG_NAME,tp->nametoken,"");
- }
- else
- tknretval = gettoken(&tp,&token2,genthead,penthead,&pcdata_ft); /* must be end tag */
- if (IS_STARTTAG(token2) || IS_ENDTAG_NOTEQ(token2,token))
- resolve_endtag(tp->cmptr,token2,tp,&retval,tknretval,genthead,penthead,token);
- popfree(newcm); /* through with this content model */
- }
- else {
- more_pcdata = FALSE;
- ungettoken(token,tp);
- }
- if (more_pcdata)
- tknretval = gettoken(&tp,&token,genthead,penthead,&pcdata_ft);
- }
- if (retval==FOUND && !cr_found)
- (*put_ctr)('|',ctrfp);
- free(outstr);
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T R C D A T A */
- /* Reads 'indoc' for replaceable character data. */
- /* Entity references are resolved normally. */
- /* RCDATA is terminated by an etago, delimiter */
- /* in context. That is a '</' followed by a name */
- /* start character. */
- /* */
- /* returns -- NFDHT, FOUND */
- /*------------------------------------------------------*/
- STATUS getrcdata(genthead,look_for_endtag,firsttime,end_of_data)
- ENTITYDESC *genthead;
- BOOLEAN look_for_endtag,*firsttime,end_of_data;
- {
- int inchar,
- prev_entitylevel;
- unsigned num_cr;
- BOOLEAN
- more_rcdata,
- same_entity,
- cr_found,
- rcdata_ft;
- STATUS retval;
-
- more_rcdata = same_entity = TRUE;
- flush_buf();
- retval = NFDHT;
- num_cr = 0;
- rcdata_ft = *firsttime;
-
- while(more_rcdata && (inchar=our_fgetc(indoc))!=EOF) {
- cr_found = save_crs(&num_cr,&inchar);
- if (inchar=='<' && look_for_endtag)
- if ((inchar=our_fgetc(indoc)) == '/')
- if (isalpha(inchar=our_fgetc(indoc)) && same_entity) {
- more_rcdata = FALSE;
- our_ungetc(inchar,indoc); /* unget in reverse order */
- unget_string("</");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- (*print_ctr)(ctrfp,"</");
- (*applic)(DATA_STG,"</","");
- our_ungetc(inchar,indoc);
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- (*put_ctr)('<',ctrfp);
- (*applic)(DATA_STG,"<","");
- our_ungetc(inchar,indoc);
- }
- else
- if (inchar==']' && !look_for_endtag)
- if ((inchar=our_fgetc(indoc)) == ']')
- if ((inchar=our_fgetc(indoc))==MARKUP_END && same_entity) {
- more_rcdata = open_rcdata_ms = FALSE;
- unget_string("]]>");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- (*print_ctr)(ctrfp,"]]");
- (*applic)(DATA_STG,"]]","");
- our_ungetc(inchar,indoc);
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- (*print_ctr)(ctrfp,"]");
- (*applic)(DATA_STG,"]","");
- our_ungetc(inchar,indoc);
- }
- else
- if (inchar == OUR_EE) {
- if (entstack[--entitylevel] != lookstack())
- ourexit(2,"\nError: Entity End occurred in different replaceable character data.\n");
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- same_entity = TRUE;
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
- prev_entitylevel = entitylevel;
- try_entref(inchar,genthead,FALSE,&rcdata_ft);
- same_entity = (prev_entitylevel == entitylevel) ? TRUE : FALSE;
- }
- }
- if (retval==FOUND && end_of_data)
- (*put_ctr)('|',ctrfp);
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ M A R K E D _ S E C T I O N */
- /* This routine processes a marked section. If */
- /* the section is an INCLUDE section, processing */
- /* is returned to gettoken, else the entire section */
- /* is processed and then control is returned. */
- /*------------------------------------------------------*/
- void get_marked_section(penthead)
- ENTITYDESC *penthead;
- {
- register int inchar,statkey;
- int begnum_open;
- BOOLEAN moredata,close_read=FALSE;
-
- if ((inchar=our_fgetc(indoc)) != '[')
- ourexit(2,"\nError: DSO not found in marked section.\n");
-
- statkey = get_status_keyword(penthead);
- while(inputps(penthead) > 0)
- gettilnosep();
- if ((inchar=our_fgetc(indoc)) != '[')
- ourexit(2,"\nError: DSO not found in marked section.\n");
-
- switch(statkey) {
- case MS_INCLUDE:
- break;
- case MS_CDATA:
- open_cdata_ms = TRUE;
- break;
- case MS_RCDATA:
- open_rcdata_ms = TRUE;
- break;
- case MS_IGNORE:
- begnum_open = num_open_ms-1; /* already incremented */
- moredata = TRUE;
- while(moredata && (inchar=our_fgetc(indoc))!=EOF)
- if (inchar=='<' && (inchar=our_fgetc(indoc))=='!' && (inchar=our_fgetc(indoc))=='[') {
- if (++num_open_ms > TAGLVL)
- ourexit(2,"\nError: Number open marked sections > TAGLVL.\n");
- }
- else
- if (inchar==']' && (inchar=our_fgetc(indoc))==']' && (inchar=our_fgetc(indoc))==MARKUP_END) {
- if (--num_open_ms == begnum_open)
- moredata = FALSE;
- }
- else
- if (inchar == OUR_EE)
- ourexit(2,"\nError: Entity End found in IGNORE marked section.\n");
- STRIP_CRs();
- close_read = TRUE;
- break;
- default:
- software_fault();
- break;
- }
- if (!close_read && statkey!=MS_INCLUDE && statkey!=MS_CDATA && statkey!=MS_RCDATA) {
- if ((inchar=our_fgetc(indoc))!=']' || (inchar=our_fgetc(indoc))!=']')
- ourexit(2,"\nError: MDO not found in marked section.\n");
- if ((inchar=our_fgetc(indoc)) != MARKUP_END)
- ourexit(2,"\nError: MDC not found in marked section.\n");
- }
- return;
- }
-
- /*------------------------------------------------------*/
- /* G E T _ N A M E */
- /* Reads from the input document for a valid */
- /* SGML name. An error condition is raised if */
- /* the length of the name is greater than NAMELEN. */
- /*------------------------------------------------------*/
- get_name(name,capitalize)
- char name[];
- int (*capitalize)();
- {
- int inchar,indx;
-
- memset(name,'\0',NAMELEN+1);
- indx=0;
- if (isalpha(inchar=our_fgetc(indoc))) {
- putchar_outbuf(name[indx++]=(*capitalize)(inchar));
- fillup(name,&indx,capitalize);
- if (indx > NAMELEN) {
- sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n");
- FATAL_ERROR()
- }
- }
- else {
- name[indx++] = (*capitalize)(inchar);
- fillup(name,&indx,capitalize);
- sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n");
- FATAL_ERROR()
- }
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /*------------------------------------------------------*/
- /*------------------------------------------------------*/
- get_entname(name,capitalize)
- char name[];
- int (*capitalize)();
- {
- int inchar,indx;
-
- memset(name,'\0',NAMELEN+1);
- indx=0;
- if (isalpha(inchar=our_fgetc(indoc))) {
- name[indx++] = (*capitalize)(inchar);
- fillup2(name,&indx,capitalize);
- if (indx > NAMELEN) {
- sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n");
- FATAL_ERROR()
- }
- }
- else {
- name[indx++] = (*capitalize)(inchar);
- fillup2(name,&indx,capitalize);
- sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n");
- FATAL_ERROR()
- }
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ N U T O K E N */
- /* Reads from the input document for a valid */
- /* SGML nutoken. An error condition is raised */
- /* if the length of the nutoken is greater than */
- /* NAMELEN. */
- /*------------------------------------------------------*/
- get_nutoken(nutoken,capitalize)
- char nutoken[];
- int (*capitalize)();
- {
- int inchar,indx;
-
- memset(nutoken,'\0',NAMELEN+1);
- indx=0;
- inchar=our_fgetc(indoc);
- if (isdigit(inchar)) { /* nutoken must start with numeral */
- putchar_outbuf(nutoken[indx++]=(*capitalize)(inchar));
- fillup(nutoken,&indx,capitalize);
- if (indx > NAMELEN) {
- sprintf(error_msg,"%s%s%s","\nError: Length of nutoken beginning '",nutoken,"' > NAMELEN\n");
- FATAL_ERROR()
- }
- }
- else {
- nutoken[indx++] = (*capitalize)(inchar);
- fillup(nutoken,&indx,capitalize);
- sprintf(error_msg,"%s%s%s","\nError: Nutoken '",nutoken,"' must start with numeral.\n");
- FATAL_ERROR()
- }
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ N M T O K E N */
- /* Reads from the input document for a valid */
- /* SGML nmtoken. An error condition is raised */
- /* if the length of the nmtoken is greater than */
- /* NAMELEN. */
- /*------------------------------------------------------*/
- get_nmtoken(nmtoken,capitalize)
- char nmtoken[];
- int (*capitalize)();
- {
- int indx=0;
-
- memset(nmtoken,'\0',NAMELEN+1);
- fillup(nmtoken,&indx,capitalize);
- if (indx > NAMELEN) {
- sprintf(error_msg,"%s%s%s","\nError: Length of nmtoken beginning '",nmtoken,"' > NAMELEN\n");
- FATAL_ERROR()
- }
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ N U M B E R */
- /* Reads from the input document for a valid */
- /* SGML number. An error condition is raised */
- /* if the length of the number is greater than */
- /* NAMELEN. */
- /*------------------------------------------------------*/
- get_number(number,capitalize)
- char number[];
- int (*capitalize)();
- {
- int indx=0;
- memset(number,'\0',NAMELEN+1);
- while(isdigit(number[indx]=our_fgetc(indoc)) && indx<=NAMELEN)
- putchar_outbuf(number[indx++]);
- if (indx > NAMELEN) {
- sprintf(error_msg,"%s%s%s","\nError: Length of number beginning '",number,"' > NAMELEN.\n");
- FATAL_ERROR()
- }
- if (indx == 0) {
- sprintf(error_msg,"%s%s%s","\nError: Invalid number, found '",number,"'.\n");
- FATAL_ERROR()
- }
- our_ungetc(number[indx],indoc);
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ P I */
- /* This routine reads a processing instruction */
- /* from 'indoc'. No parsing is done on the p.i. */
- /* The p.i. is terminated by a TAGC. The output */
- /* buffer has already been flushed, therefore */
- /* the p.i. is dumped directly into 'outdoc'. */
- /*------------------------------------------------------*/
- void get_pi()
- {
- register int inchar,pi_length;
- char outpi[PILEN+1];
-
- pi_length = 0;
- while((inchar=our_fgetc(indoc))!=MARKUP_END && inchar!=EOF && PILEN>pi_length) {
- if (inchar == OUR_EE)
- ourexit(2,"\nError: EE is invalid in processing instruction.\n");
- else
- (*put_ctr)(inchar,ctrfp);
- outpi[pi_length++] = inchar;
- }
- (*put_ctr)(']',ctrfp);
- STRIP_CRs();
- if (pi_length > PILEN)
- ourexit(2,"\nError: Length of processing instruction > PILEN.\n");
- outpi[pi_length] = '\0';
- (*applic)(PROC_INST,outpi,"");
- return;
- }
-
- /*------------------------------------------------------*/
- /* G E T _ S T A T U S _ K E Y W O R D */
- /* This routine will parse the status keyword */
- /* specification of a marked secttion declaration. */
- /* Zero or more status keywords are allowed in */
- /* the specification. If none is specified, the */
- /* default of INCLUDE is returned. If multiple */
- /* keywords are defined, the following priority */
- /* is used (highest shown first): */
- /* "IGNORE" */
- /* "CDATA" */
- /* "RCDATA" */
- /* "INCLUDE" */
- /*------------------------------------------------------*/
- get_status_keyword(penthead)
- ENTITYDESC *penthead;
- {
- int inchar,retval;
- char keyname[NAMELEN+1];
-
- retval = MS_INCLUDE; /* if none are specified, INCLUDE is assumed */
- while(inputps(penthead) > 0);
- gettilnosep();
-
- while((inchar=our_fgetc(indoc)) != '[') {
- our_ungetc(inchar,indoc);
- get_entname(keyname,our_toupper);
- if (strcmp(keyname,"IGNORE") == 0)
- retval = MAX(MS_IGNORE,retval);
- else
- if (strcmp(keyname,"CDATA") == 0)
- retval = MAX(MS_CDATA,retval);
- else
- if (strcmp(keyname,"RCDATA") == 0)
- retval = MAX(MS_RCDATA,retval);
- else
- if (strcmp(keyname,"INCLUDE") == 0)
- retval = MAX(MS_INCLUDE,retval);
- else
- if (strcmp(keyname,"TEMP") != 0)
- ourexit(2,"\nError: Illegal status keyword in marked section\n");
- while(inputps(penthead) > 0);
- gettilnosep();
- }
- our_ungetc(inchar,indoc);
- return(retval);
- }
-
- /*--------------------------------------------------------------*/
- /* G E T T O K E N */
- /* This routine attempts at all costs to get a tag from */
- /* the document. If a tag has already been read and */
- /* "ungettoken"d, then that tag is returned. If not, */
- /* then parsing continues eating up all comments and */
- /* processing instructions. Marked sections are opened */
- /* and processed as far as possible, meaning until data */
- /* is found. */
- /*--------------------------------------------------------------*/
- TKNRETVAL gettoken(tp,token,genthead,penthead,get_ft)
- int *token;
- STPTR *tp;
- ENTITYDESC *genthead,*penthead;
- BOOLEAN *get_ft;
- {
- char genid[NAMELEN+1]; /* generic identifier read from indoc */
- int curr_delim, /* current delimiter working with */
- inchar, /* current input character */
- open_token;
- unsigned
- nleng_spec_list, /* normalized length of specification list */
- num_id_idref; /* number of ID and IDREF attribute values */
- STENTRY *opened_tp;
- TKNRETVAL retval; /* either MARKUP_FOUND or TEXT */
-
- nleng_spec_list = num_id_idref = 0;
- curr_delim = PIO;
-
- if (state == GETNEW) {
- while(curr_delim==PIO || (curr_delim==MDO && !open_cdata_ms && !open_rcdata_ms)) { /* get input from input document */
- flush_buf();
- open_token = ((opened_tp=lookstack()) == NULL) ? rootid : opened_tp->tokenid;
- if (num_open_ms > 0)
- get_ms_closes();
- if (symtable[open_token].content_type==ELEMENT_CONTENT &&
- !open_cdata_ms && !open_rcdata_ms) {
- while ((inchar=our_fgetc(indoc))=='&' || inchar==RE || inchar==RS ||
- inchar==SEPCHAR || inchar==SPACE || inchar==OUR_EE) {
- try_entref(inchar,genthead,TRUE,&dontcare);
- gettilnosep();
- }
- our_ungetc(inchar,indoc);
- }
- if (num_open_ms > 0)
- get_ms_closes();
-
- switch(curr_delim=getdelim()) {
- case NODELIM:
- case EOF:
- retval = TEXT; /* if find EOF, just assume it was TEXT */
- break;
- case PIO:
- retval = TEXT; /* just an assumption */
- flush_buf(); /* flush delimiter out */
- *get_ft = TRUE;
- get_pi();
- break;
- case MDO:
- retval = TEXT; /* just an assumption */
- inchar = our_fgetc(indoc);
- our_ungetc(inchar,indoc);
- if (inchar == MARKUP_END) { /* null comment */
- CLEAR_BUF(); /* clear out MDO */
- if ((inchar=our_fgetc(indoc)) != MARKUP_END)
- ourexit(2,"\nError: MDO not found for comment declaration\n");
- STRIP_CRs();
- }
- else
- if (inchar == '-') { /* regular comment */
- CLEAR_BUF(); /* clear out MDO */
- while(inputps(penthead) > 0);
- if ((inchar=our_fgetc(indoc)) != MARKUP_END)
- ourexit(2,"\nError: MDO not found for comment declaration\n");
- STRIP_CRs();
- }
- else {
- CLEAR_BUF(); /* flush delimiter out */
- if (++num_open_ms > TAGLVL)
- ourexit(2,"\nError: Number of open marked sections > TAGLVL\n");
- get_marked_section(penthead);
- }
- break;
- case ETAGO:
- get_name(genid,our_toupper);
- sprintf(lastread_tag,"</%s>",genid);
- place_in_queue(END_TAG_NAME,genid,"");
- retval = MARKUP_FOUND;
- if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL)
- *token = (*tp)->tokenid;
- else {
- sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"' in endtag.\n");
- FATAL_ERROR()
- }
- *token |= HIGHBIT; /* turn high bit on for end tags */
- gettilnosep();
- if ((inchar=our_fgetc(indoc)) != MARKUP_END) {
- sprintf(error_msg,"%s%s%s","\nError: TAGC not found for '",genid,"'.\n");
- FATAL_ERROR()
- }
- putchar_outbuf(']'); /* TAGC to buffer */
- STRIP_CRs();
- *get_ft = TRUE;
- break;
- case STAGO:
- retval = get_starttag(token,tp,genthead,get_ft,&nleng_spec_list,&num_id_idref);
- break;
- default:
- software_fault();
- } /*switch*/
- } /*while*/
- }
- else { /* get input from intermediate source, i.e. after ungettoken */
- state = GETNEW; /* next time get from document */
- *token = holdtoken;
- *tp = holdtp;
- retval = MARKUP_FOUND;
- }
- if (nleng_spec_list > ATTSPLEN)
- ourexit(2,"\nError: Normalized length of attribute spec list > ATTSPLEN\n");
- if (num_id_idref > GRPCNT)
- ourexit(2,"\nError: Total number of id reference names > GRPCNT.\n");
- return(retval);
- }
-
- /*--------------------------------------------------------------*/
- /* G E T _ S T A R T T A G */
- /* This routine handles the processing of a start tag. */
- /* First the name of the tag is read and then a search */
- /* is made to ensure that the name is a valid generic */
- /* identifier. The attributes and their values are */
- /* then read in and verified one at a time. */
- /*--------------------------------------------------------------*/
- TKNRETVAL get_starttag(token,tp,genthead,get_ft,nleng_spec_list,num_id_idref)
- int *token;
- STPTR *tp;
- ENTITYDESC *genthead;
- BOOLEAN *get_ft;
- unsigned *nleng_spec_list,*num_id_idref;
- {
- char genid[NAMELEN+1], /* generic identifier read from indoc */
- attrname[NAMELEN+1]; /* name of attribute value */
- int inchar, /* current input character */
- leng, /* length of attribute name */
- temp_bufptr,
- tagsize; /* current length of tag */
- ATTRDESC *thisadp; /* points to description of attribute */
- BOOLEAN notat_specified;
- TKNRETVAL retval; /* either MARKUP_FOUND or TEXT */
-
- notat_specified = FALSE;
- tagsize = get_name(genid,our_toupper);
- sprintf(lastread_tag,"<%s>",genid);
- place_in_queue(TAG_NAME,genid,"");
- retval = MARKUP_FOUND;
- if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL)
- *token = (*tp)->tokenid;
- else {
- sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"'.\n");
- FATAL_ERROR()
- }
- (*tp)->cmptr->contref_attr = FALSE;
- unprocess((*tp)->adptr);
- tagsize += gettilnosep();
- temp_bufptr = bufptr;
- while((inchar=our_fgetc(indoc)) != MARKUP_END) {
- our_ungetc(inchar,indoc);
- putchar_outbuf(' ');
- leng = get_name(attrname,our_toupper);
- *nleng_spec_list += leng+NORMSEP;
- tagsize += leng;
- if ((thisadp=find_attr(attrname,(*tp)->adptr)) == NULL) {
- sprintf(error_msg,"%s%s%s","\nError: Unknown attribute name'",attrname,"'.\n");
- FATAL_ERROR()
- }
- else
- if (thisadp->processed == TRUE) {
- sprintf(error_msg,"%s%s%s","\nError: Duplicate attribute specifications '",thisadp->attrname,"'.\n");
- FATAL_ERROR()
- }
- else
- *nleng_spec_list += get_attrvalue(thisadp,genthead,&tagsize,&((*tp)->cmptr->contref_attr),&(notat_specified));
- tagsize += gettilnosep();
- }
- bufptr = temp_bufptr;
- if (req_not_proc((*tp)->adptr) == TRUE) {
- sprintf(error_msg,"%s%s%s","\nError: REQUIRED or CURRENT attribute not specified '",
- (*tp)->adptr->attrname,"'.\n");
- FATAL_ERROR()
- }
- if (tagsize > TAGLEN)
- ourexit(3,"\nLength of undelimited start tag > TAGLEN.\n");
- *num_id_idref += resolve_attr((*tp)->adptr,FALSE);
- place_in_queue(TAG_END,"","");
- if ((*tp)->adptr == NULL)
- putchar_outbuf(']');
- else
- putstr_outbuf("\n]");
- STRIP_CRs();
- *get_ft = FALSE;
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ A T T R V A L U E */
- /* This routine processes the attribute value part */
- /* of an attribute specification. The value is */
- /* checked for correctness in terms of syntax as */
- /* well as semantics. */
- /*------------------------------------------------------*/
- get_attrvalue(thisadp,genthead,taglen,contref,notat_specified)
- ATTRDESC *thisadp;
- ENTITYDESC *genthead;
- int *taglen;
- BOOLEAN *contref,*notat_specified;
- {
- char name[NAMELEN+1],
- buffer[ATTSPLEN+1],
- idrefname[NAMELEN+1],
- idname[NAMELEN+1];
- unsigned
- length,
- nleng_attrval,
- num_csdata;
- register int inchar;
- int delim,
- val,
- (*getone)();
- BOOLEAN more_attr_vals;
- GROUPDESC *groupptr;
-
- (*taglen) += gettilnosep()+1;
- length = 0;
- nleng_attrval = NORMSEP;
- thisadp->processed = TRUE;
- if ((inchar=our_fgetc(indoc)) != '=') /* name has already been read */
- ourexit(2,"\nError: Invalid value indicator in attribute specification.\n");
- putchar_outbuf('=');
- (*taglen) += gettilnosep() + 1;
-
- if ((delim=our_fgetc(indoc))==LITA || delim==LIT)
- putchar_outbuf(delim);
- else
- ourexit(2,"\nError: LIT or LITA not specified in attribute specification\n");
-
- BLANK(buffer,ATTSPLEN+1);
-
- switch(thisadp->dvcode) {
- case NAME:
- case NAMES:
- case NOTATION:
- getone = get_name;
- break;
- case NUMBER:
- case NUMBERS:
- getone = get_number;
- break;
- case NMTOKEN:
- case NMTOKENS:
- case GROUP:
- getone = get_nmtoken;
- break;
- case NUTOKEN:
- case NUTOKENS:
- getone = get_nutoken;
- break;
- }
-
- (*taglen) += process_attr(buffer,delim,genthead,thisadp->dvcode,&num_csdata);
- nleng_attrval += num_csdata*NORMSEP;
- (*taglen)++; /* close delimiter */
-
- if (thisadp->dvcode != ENUM_CDATA) {
- unget_string(buffer);
- gettilnosep();
- }
-
- if (thisadp->defcode == A_CONREF) {
- if (*notat_specified == TRUE)
- ourexit(2,"\nError: Content reference attribute not allowed with notation attribute.\n");
- *contref = TRUE;
- }
-
- more_attr_vals = TRUE;
-
- switch(thisadp->dvcode) {
- case ENUM_CDATA:
- putstr_outbuf(buffer);
- nleng_attrval += strlen(buffer);
- length = strlen(buffer);
- get_close(delim);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
- break;
- case GROUP:
- case NOTATION:
- length = (*getone)(name,our_toupper); /* GROUP is actually a nmtoken */
- if (thisadp->dvcode == NOTATION) {
- if (*contref == TRUE) {
- sprintf(error_msg,"%s%s%s","\nError: Notation attribute '",name,"' specified after content reference.\n");
- FATAL_ERROR()
- }
- *notat_specified = TRUE;
- }
- nleng_attrval += length+NORMSEP;
-
- /* value must have been defined as part of the group */
- if ((groupptr=find_group(name,thisadp->groupp)) == NULL) {
- sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute group member ",name);
- FATAL_ERROR()
- }
- else
- thisadp->u2.currgrp = groupptr;
- get_close(delim);
- check_fixed(thisadp->defcode,name,thisadp->u2.currgrp->groupname,NAMELEN);
- break;
- case NAME:
- case NMTOKEN:
- case NUTOKEN:
- case NUMBER:
- length = (*getone)(buffer,our_toupper,FALSE);
- nleng_attrval += length+NORMSEP;
- get_close(delim);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
- break;
- case NUMBERS:
- case NAMES:
- case NMTOKENS:
- case NUTOKENS:
- while(more_attr_vals) { /* process each attribute value */
- /* in list, one at a time */
- val = (*getone)(buffer+length,our_toupper);
- nleng_attrval += NORMSEP + val;
- length += val;
- if (gettilnosep() != 0)
- length++;
- more_attr_vals = ((inchar=our_fgetc(indoc)) != delim);
- if (inchar != delim) {
- putchar_outbuf(' ');
- our_ungetc(inchar,indoc);
- }
- }
- putchar_outbuf(inchar);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
- break;
- case ID:
- length = get_name(buffer,our_toupper);
- nleng_attrval += length+NORMSEP;
- strcpy(idname,buffer);
- get_close(delim);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
- break;
- case IDREF:
- length = get_name(buffer,our_toupper);
- nleng_attrval += length+NORMSEP;
- strcpy(idrefname,buffer);
- get_close(delim);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
- break;
- case IDREFS:
- while(more_attr_vals) { /* process each attribute value */
- /* of the list one at a time */
- val = get_name(buffer+length,our_toupper);
- strncpy(idrefname,buffer+length,val);
- length += val;
- nleng_attrval += NORMSEP+val;
- if (gettilnosep() != 0)
- length++;
- more_attr_vals = ((inchar=our_fgetc(indoc)) != delim);
- if (inchar != delim) {
- putchar_outbuf(' ');
- our_ungetc(inchar,indoc);
- }
- }
- putchar_outbuf(inchar);
- check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
- break;
- case ENTITY:
- length = get_name(name,nullfnc);
- nleng_attrval += length+NORMSEP;
- if (find_entity(genthead,name,FALSE) == NULL) {
- sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute general entity name ",name);
- FATAL_ERROR()
- }
- check_fixed(thisadp->defcode,name,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
- get_close(delim);
- break;
- default:
- software_fault();
- break;
- }
- if (nleng_attrval > LITLEN)
- ourexit(2,"\nError: Normalized length of attribute value > LITLEN\n");
- if (thisadp->dvcode!=NOTATION && thisadp->dvcode!=GROUP) {
- if (thisadp->u2.currdef != NULL)
- free(thisadp->u2.currdef);
- thisadp->u2.currdef = get_char_mem(length+1);
- buffer[length] = '\0';
- strcpy(thisadp->u2.currdef,buffer);
- }
- return(nleng_attrval);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ C D A T A _ M S */
- /*------------------------------------------------------*/
- STATUS get_cdata_ms(firsttime)
- BOOLEAN *firsttime;
- {
- BOOLEAN moredata,cr_found,cdata_ms_ft;
- STATUS retval;
- int inchar;
- unsigned num_cr;
- char *outstr;
-
- flush_buf();
- retval = NFDHT;
- cdata_ms_ft = *firsttime;
- outstr = get_char_mem(2);
- moredata = TRUE;
- while(moredata && (inchar=our_fgetc(indoc))!=EOF) {
- cr_found = save_crs(&num_cr,&inchar);
- if (inchar == ']')
- if ((inchar=our_fgetc(indoc)) == ']')
- if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
- moredata = FALSE;
- our_ungetc(MARKUP_END,indoc);
- unget_string("]]");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
- (*print_ctr)(ctrfp,"]]%c",inchar);
- (*applic)(DATA_STG,"]]","");
- *outstr = inchar;
- (*applic)(DATA_STG,outstr,"");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
- (*print_ctr)(ctrfp,"]%c",inchar);
- (*applic)(DATA_STG,"]","");
- *outstr = inchar;
- (*applic)(DATA_STG,outstr,"");
- }
- else {
- retval = FOUND;
- check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
- (*put_ctr)(inchar,ctrfp);
- *outstr = inchar;
- (*applic)(DATA_STG,outstr,"");
- }
- }
- open_cdata_ms = FALSE;
- free(outstr);
- return(retval);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ C L O S E */
- /* This routine reads from 'indoc' for the */
- /* delimiter passed to it as a parameter. If */
- /* the delimiter is not found, an error is raised. */
- /*------------------------------------------------------*/
- void get_close(delim)
- int delim;
- {
- int inchar;
- if ((inchar=our_fgetc(indoc)) != delim)
- ourexit(2,"\nError: Lit or lita delimeter not found in attribute literal.\n");
- else
- putchar_outbuf(inchar);
- return;
- }
-
- /*------------------------------------------------------*/
- /* G E T T I L N O S E P */
- /* This routine reads from the file until a */
- /* non-separator is found. */
- /*------------------------------------------------------*/
- gettilnosep()
- {
- register int indx;
- int inchar;
-
- indx = 0;
- /* notice we aren't writing unneeded seperators to output file */
- inchar=our_fgetc(indoc); /* get character from file */
- while(SEPERATOR(inchar)) {
- inchar=our_fgetc(indoc);
- indx++;
- }
- our_ungetc(inchar,indoc);
- return(indx);
- }
-
- /*------------------------------------------------------*/
- /* G E T _ C H A R _ M E M */
- /* This routine allocates memory for character */
- /* data and raises an error condition if there */
- /* is insufficient memory for the allocation. */
- /*------------------------------------------------------*/
/*
 * Allocates 'number' zero-filled chars with calloc().
 *
 * number  how many chars to allocate.
 *
 * Returns the new block.  If calloc() yields NULL, ourexit() is called
 * with status 2 and an out-of-memory message.
 */
char *get_char_mem(number)
int number;
{
    char *calloc();
    char *block;

    block = calloc(number,sizeof(char));
    if (block == NULL)
        ourexit(2,"\nInsufficient memory in parse3\n");
    return(block);
}
-
- /*------------------------------------------------------*/
- /* G E T _ M S _ C L O S E S */
- /* This routine reads from 'indoc' as many */
- /* marked section closes as possible. */
- /*------------------------------------------------------*/
- void get_ms_closes()
- {
- int inchar,open_token;
- STENTRY *opened_tp;
- BOOLEAN more_ms_closes=TRUE;
-
- while(more_ms_closes && (inchar=our_fgetc(indoc))!=EOF) {
- if (inchar == ']')
- if ((inchar=our_fgetc(indoc)) == ']')
- if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
- if (--num_open_ms == 0)
- more_ms_closes = FALSE;
- if ((inchar=our_fgetc(indoc)) != OUR_EE)
- our_ungetc(inchar,indoc);
- }
- else {
- our_ungetc(inchar,indoc);
- unget_string("]]");
- more_ms_closes = FALSE;
- }
- else {
- our_ungetc(inchar,indoc);
- our_ungetc(']',indoc);
- more_ms_closes = FALSE;
- }
- else {
- our_ungetc(inchar,indoc);
- more_ms_closes = FALSE;
- }
- open_token = ((opened_tp=lookstack()) == NULL) ? rootid : opened_tp->tokenid;
-
- if (symtable[open_token].content_type == ELEMENT_CONTENT)
- gettilnosep(); /* seperators are allowed between tags */
- }
- return;
- }
-